# PRELIMINARIES

# Import libraries
import re
import pandas as pd
from transformers import pipeline
import spacy
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load spaCy English model
nlp = spacy.load('en_core_web_sm')

# Load survey data and variable labels from a single reader. The context
# manager closes the underlying Stata file handle, which the bare
# `iterator=True` call would otherwise leave open.
with pd.read_stata('survey_man.dta', convert_categoricals=False, iterator=True) as survey_man_iter:
    survey_man = survey_man_iter.read()
    # Get variable labels for survey questions
    survey_man_labels = survey_man_iter.variable_labels()

# Replace empty (or whitespace-only) strings with missing values
survey_man = survey_man.replace(r'^\s*$', pd.NA, regex=True)

# Keep only the labels of variables whose name contains '_q'
questions_man = {key: value for key, value in survey_man_labels.items() if '_q' in key}

# Extract manager perception: finished surveys with a non-missing answer to man_q17.
# .copy() so that columns added later do not write into a view of survey_man.
perception_man = survey_man.loc[
    (survey_man['finished'] == 1) &
    (survey_man['man_q17'].notna()),
    ['man_id', 'man_q17']
].copy()

# SENTIMENT ANALYSIS

# Clean responses
def clean_text(text):
    """Return `text` with outer whitespace removed and inner runs collapsed.

    Leading and trailing whitespace is stripped first; every remaining run of
    whitespace characters is then replaced by a single space.
    """
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

perception_man['man_q17_clean'] = perception_man['man_q17'].apply(clean_text)

# Word count (number of whitespace-separated tokens in the cleaned response)
perception_man['word_count'] = perception_man['man_q17_clean'].apply(lambda x: len(str(x).split()))

# Sentiment analysis pipeline
sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
# truncation=True cuts responses that exceed the model's maximum input length;
# without it a single overly long response makes the whole call fail.
sentiment_results = sentiment_pipeline(
    perception_man['man_q17_clean'].tolist(),
    truncation=True,
)
perception_man['sentiment_label'] = [result['label'] for result in sentiment_results]
# Numeric coding used below for the word clouds: 0 = positive, 1 = negative
perception_man['sentiment_num'] = perception_man['sentiment_label'].map({'POSITIVE': 0, 'NEGATIVE': 1})

# WORD CLOUDS

# Preprocess text
def preprocess_text(text):
    """Reduce a response to a space-separated string of lemmas.

    The lowercased text is run through the module-level spaCy pipeline `nlp`.
    Stop words and tokens that are not purely alphabetic are dropped; the
    lemmas of the remaining tokens are joined with single spaces.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Skip stop words and anything containing non-letter characters
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

perception_man['man_q17_preprocess'] = perception_man['man_q17_clean'].apply(preprocess_text)

# Word cloud by sentiment
# Codes must match the `sentiment_num` mapping (0 = positive, 1 = negative).
sentiments = {'pos': 0, 'neg': 1}
words = {}
wordclouds = {}

# Fixed seed so the word-cloud layout is reproducible across runs
seed = 123

for suffix, num in sentiments.items():

    title = 'Positive' if num == 0 else 'Negative'

    # Pool all preprocessed responses of this sentiment into one string
    words[suffix] = ' '.join(
        perception_man.loc[perception_man['sentiment_num'] == num, 'man_q17_preprocess']
    )

    # WordCloud.generate raises ValueError when no plottable word remains
    # (e.g. no response with this sentiment); skip that group instead of
    # aborting the whole script.
    try:
        wordclouds[suffix] = WordCloud(
            width=800, height=640,
            max_words=10,
            random_state=seed
        ).generate(words[suffix])
    except ValueError:
        print(f"No words available for the {title.lower()} word cloud; skipping.")
        continue

    # Draw each cloud on its own figure rather than on whatever is current
    plt.figure()
    plt.imshow(wordclouds[suffix], interpolation="bilinear")
    plt.axis("off")
    plt.title(f"{title}")
    plt.show()